/*****************************************************************
* Do File: clean_metro.do
*
* Purpose:
*   This file takes as input the clean_census_tract.dta, race data from 
*   Census/ACS, industry employment shares from BLS, and the gini 
*   computed at the metro level. It then creates a combined dataset 
*   of variables at the metro level.
*
* Variables Created (Labels Only):
*
*   "Census Year"
*   "Metro Code"
*   "Metro Name"
*   "Dissim, All Family"
*   "Dissim, Families with kids"
*   "Dissim, Families without kids"
*   "Gini, Families with kids"
*   "Gini, Families without kids"
*   "Gini, All Families"
*   "Gini, Aggregate Weighted Fam Kids"
*   "Gini, Aggregate Uneighted Fam Kids"
*   "Gini, Aggregate Weighted Fam NoKids"
*   "Gini, Aggregate Uneighted Fam NoKids"
*   "Gini, Aggregate Weighted AllFam"
*   "Gini, Aggregate Uneighted AllFam"
*   "White, Population Share"
*   "Black, Population Share"
*   "Indian, Population Share"
*   "Asian, Population Share"
*   "Other, Population Share"
*   "Indicator, MSA Dissim"
*   "Indicator, MSA Race"
*
*   "Agriculture, forestry, fishing and hunting"
*   "Construction"
*   "Real estate & Rental and Leasing / Finance and Insurance"
*   "Manufacturing"
*   "Mining, quarrying, and oil and gas extraction"
*   "Unclassified"
*   "Public administration"
*   "Retail trade"
*   "Educational services; Accommodation and food services; Other services"
*   "Transportation and warehousing; Utilities"
*   "Wholesale trade"
*
*****************************************************************/






********************************************************************************
**** Families with children
********************************************************************************
use "$output/clean_census_tract.dta", clear
keep if ind_children==1
drop if f_count_ct_kids == .
keep year metro countyfips fips bracket_no f_count_ct_kids
tempfile kids
save `kids'

collapse (sum) f_count_ct_kids, by(year metro bracket_no)
bysort year metro (bracket_no): egen total = sum(f_count_ct_kids)
gen share = f_count_ct_kids / total
* Calculate the cumulative share to identify percentiles
bysort year metro (bracket_no) :  gen cdf = sum(share)
* Find the income bucket that is closest to the 80th percentile
gen flag1 = abs(cdf - 0.8)
bysort year metro (bracket_no) :  egen flagp = min(flag1)
gen flagc = flagp == flag1
* Identify census fips in the top 20%
gen negbrack = -bracket_no
bysort year metro (negbrack) :  gen flagc2 = sum(flagc)
gen id = flagc2 == 0
keep year metro bracket id
* Merge metro level cut-offs into the overall sample
merge 1:m year metro bracket_no using `kids'
* Calculate the total number of households in the rich and poor neighborhoods
* by census fips
collapse (sum) f_count_ct_kids, by(year metro fips id)
bysort year metro id : egen denom = sum(f_count_ct_kids)
* Generate the dissimilarity ratio and add up
gen ratio = f_count_ct_kids / denom * (1 - 2*(id == 1))
collapse (sum) ratio, by(year metro fips)
replace ratio = abs(ratio)/2
collapse (sum) ratio, by(year metro)

rename ratio famkids
tempfile famkids
save `famkids'



*******************************************************************************
**** Families without children
********************************************************************************
use "$output/clean_census_tract.dta", clear
keep if ind_children==1
drop if f_count_ct_wokids == .

keep year metro countyfips fips bracket_no f_count_ct_wokids
tempfile nokids
save `nokids'

collapse (sum) f_count_ct_wokids, by(year metro bracket_no)
bysort year metro (bracket_no): egen total = sum(f_count_ct_wokids)
gen share = f_count_ct_wokids / total
bysort year metro (bracket_no) :  gen cdf = sum(share)
gen flag1 = abs(cdf - 0.8)
bysort year metro (bracket_no) :  egen flagp = min(flag1)
gen flagc = flagp == flag1
gen negbrack = -bracket_no
bysort year metro (negbrack) :  gen flagc2 = sum(flagc)
gen id = flagc2 == 0
keep year metro bracket id
merge 1:m year metro bracket_no using `nokids'
collapse (sum) f_count_ct_wokids, by(year metro fips id)
bysort year metro id : egen denom = sum(f_count_ct_wokids)
gen ratio = f_count_ct_wokids / denom * (1 - 2*(id == 1))
collapse (sum) ratio, by(year metro fips)
replace ratio = abs(ratio)/2
collapse (sum) ratio, by(year metro)

rename ratio FamNoKids
label var FamNoKids "Families without children"

merge 1:1 year metro using `famkids'
drop _merge 
save `famkids', replace


********************************************************************************
**** All Families
********************************************************************************
use "$output/clean_census_tract.dta", clear
keep if ind_base==1
drop if f_count_ct == .
keep year metro countyfips fips bracket_no f_count_ct
tempfile kids
save `kids'

collapse (sum) f_count_ct, by(year metro bracket_no)
bysort year metro (bracket_no): egen total = sum(f_count_ct)
gen share = f_count_ct/ total
* Calculate the cumulative share to identify percentiles
bysort year metro (bracket_no) :  gen cdf = sum(share)
* Find the income bucket that is closest to the 80th percentile
gen flag1 = abs(cdf - 0.8)
bysort year metro (bracket_no) :  egen flagp = min(flag1)
gen flagc = flagp == flag1
* Identify census fips in the top 20%
gen negbrack = -bracket_no
bysort year metro (negbrack) :  gen flagc2 = sum(flagc)
gen id = flagc2 == 0
keep year metro bracket id
* Merge metro level cut-offs into the overall sample
merge 1:m year metro bracket_no using `kids'
* Calculate the total number of households in the rich and poor neighborhoods
* by census fips
collapse (sum) f_count_ct, by(year metro fips id)
bysort year metro id : egen denom = sum(f_count_ct)
* Generate the dissimilarity ratio and add up
gen ratio = f_count_ct/ denom * (1 - 2*(id == 1))
collapse (sum) ratio, by(year metro fips)
replace ratio = abs(ratio)/2
collapse (sum) ratio, by(year metro)

rename ratio allfam

merge 1:1 year metro using `famkids'
drop _merge 


rename allfam allfamdissim 
rename famkids famkidsdissim 
rename FamNoKids famnokidsdissim

********
** Save the combined metro level dissim data
gen ind_msa_dissim = 1 
save "$output/metro_dissim_all.dta", replace 







*******************************************************
* Race
*******************************************************




use  "$path\Raw_Data\2003_crosswalk_all_id_bc.dta", clear
tempfile crosswalk
save `crosswalk', replace


* Define the years to process 
local years "1980 1990 2000 2010"


tempfile master

local first = 1

foreach yr of local years {
    di "Processing year `yr'"
    
    local path = "$path/Raw_Data/Social Explorer/Race/`yr'/"
    
     if "`yr'"=="1980" {
         local file = "`path'R12837912_SL11.txt"
    }
	else if "`yr'"=="1990" {
         local file = "`path'R12837899_SL050.txt"
    }
    else if "`yr'"=="2000" {
         local file = "`path'R12837900_SL050.txt"
    }
    else {
         local file = "`path'R12837901_SL050.txt"
    }
    
	import delimited using "`file'", delimiter(tab) clear
	
	foreach var of varlist _all {
		if substr("`var'", 1, 4) == "geo_" {
			local newname = substr("`var'", 5,. )
			rename `var' `newname'
		}
	}
	
	gen countyfips = state * 1000 + county
	
	replace countyfips = 12086 if countyfips == 12025
	
	drop if state == 72
	
	gen year = `yr'
	
	merge m:1 countyfips using `crosswalk', keepusing(metro)
	keep if _merge == 3
	drop _merge 
	
	ds se_*
	local sevars `r(varlist)'

	foreach var of local sevars {
    // Extract the last 3 characters after the last underscore
		local suffix = substr("`var'", -3, .)
		rename `var' ct`suffix'
	}
	
	ds year countyfips metro ct*

	
	collapse (sum) ct*, by(year metro)
	
	if inlist(`yr', 1980, 1990) {
		gen white = ct002/ct001
		gen black = ct003/ct001
		gen indian = ct004/ct001
		gen asian = ct005/ct001
		gen other = ct006/ct001
	}
	
	else if inlist(`yr', 2000, 2010) {
		gen white = ct002/ct001
		gen black = ct003/ct001
		gen indian = ct004/ct001
		gen asian = (ct005 + ct006)/ct001
		gen other = (ct007 + ct008)/ct001
	}
	
	else {
		de in red "Year `yr' should be one of 1980, 1990, 2000, or 2010"
		exit 1
	}
	
	keep year metro white black indian asian other
	
	if `first' {
		save `master', replace
		local first = 0
	}
	else {
		append using `master'
		save `master', replace
	}

}
	
use `master', clear
sort year metro
gen ind_msa_race=1
save "$output/metro_level_race.dta", replace

    

	
	
	
	
	

*******************************************************
* BLS 1980 DATA
*******************************************************
use "$path/Raw_Data/BLS/BLS_1980.dta", clear

keep if agglvl_code == 28

gen countyfips = real(area_fips)
replace countyfips = 12086 if countyfips == 12025

merge m:1 countyfips using "$path\Raw_Data\2003_crosswalk_all_id_bc.dta", keepusing(metro)
keep if _merge == 3
drop _merge


collapse (sum) annual_avg_emplvl, by(metro industry_title)
bysort metro: egen employment = total(annual_avg_emplvl)
gen employment_share = annual_avg_emplvl / employment


egen ind = group(industry_title), missing


tempfile bls80_temp
save `bls80_temp', replace

keep industry_title ind
duplicates drop
tempfile ind_codes
save `ind_codes', replace

use `bls80_temp', clear
keep metro employment_share ind
rename employment_share emp_share 

sort metro ind
reshape wide emp_share, i(metro) j(ind)

* Replace missing with 0
foreach var of varlist emp_share* {
    replace `var' = 0 if missing(`var')
}

gen year = 1980
gen ind_bls80 = 1


tempfile bls80
save `bls80', replace	
	
	
*******************************************************
* 2. BLS 2010 DATA
*******************************************************

use "$path/Raw_Data/BLS/BLS_2010.dta", clear

keep if agglvl_code == 74

gen countyfips = real(area_fips)
replace countyfips = 12086 if countyfips == 12025

merge m:1 countyfips using "$path\Raw_Data\2003_crosswalk_all_id_bc.dta", keepusing(metro)
keep if _merge == 3
drop _merge

decode industry_title, gen(industry_title_str)
drop industry_title
rename industry_title_str industry_title

gen ind =. 
replace ind = 1 if industry_title == "Agriculture, forestry, fishing and hunting"
replace ind = 2 if industry_title == "Construction"
replace ind = 3 if inlist(industry_title, "Real estate and rental and leasing", "Finance and insurance")
replace ind = 4 if industry_title == "Manufacturing"
replace ind = 5 if industry_title == "Mining, quarrying, and oil and gas extraction"
replace ind = 6 if industry_title == "Unclassified"
replace ind = 7 if industry_title == "Public administration"
replace ind = 8 if industry_title == "Retail trade"
replace ind = 9  if inlist(industry_title, "Educational services", "Accommodation and food services", ///
                            "Other services, except public administration", "Administrative and waste services", ///
                            "Arts, entertainment, and recreation", "Professional and technical services", ///
                            "Management of companies and enterprises", "Information", ///
                            "Health care and social assistance")
replace ind = 10 if inlist(industry_title, "Transportation and warehousing", "Utilities")
replace ind = 11 if industry_title == "Wholesale trade"

collapse (sum) annual_avg_emplvl, by(metro ind)
bysort metro: egen emplyment = total(annual_avg_emplvl)
gen employment_share = annual_avg_emplvl / emplyment

merge m:1 ind using `ind_codes'
drop _merge

keep metro employment_share ind
rename employment_share emp_share 
sort metro ind
reshape wide emp_share, i(metro) j(ind)
* Replace missing with 0
foreach var of varlist emp_share* {
    replace `var' = 0 if missing(`var')
}

gen year = 2010
gen ind_bls10=1


tempfile bls10
save `bls10', replace	
	
	
use `bls80', clear
append using `bls10'
tempfile bls_all
save `bls_all', replace
	
	





** Combining measures at the MSA level 
use "$output/metro_dissim_all.dta", clear 
merge 1:1 year metro using "$output/metro_gini_all.dta"
keep if _merge==3
drop _merge
merge 1:1 year metro using "$output/metro_level_race.dta"
drop _merge
merge 1:1 year metro using  "$output/popweight.dta"
drop _merge
merge 1:1 year metro using  `bls_all'
drop _merge



preserve
use  "$path\Raw_Data\2003_crosswalk_all_id_bc.dta", clear
keep metro metro_str 
duplicates drop metro metro_str, force 
tempfile msa_name 
save `msa_name', replace 
restore 

merge m:1 metro using  `msa_name'
drop _merge

sort year metro  
order year metro metro_str population *dissim *gini  



** Label all the variables 
label variable year "Census Year"
label variable metro "Metro Code"
label variable metro_str "Metro Name"
label variable allfamdissim "Dissim, All Family"
label variable famkidsdissim "Dissim, Families with kids"
label variable famnokidsdissim "Dissim, Families without kids"
label variable famkidsgini "Gini, Families with kids"
label variable famnokidsgini "Gini, Families without kids"
label variable allfamgini "Gini, All Families"
label variable famkids_weighted "Gini, Aggregate Weighted Fam Kids"
label variable famkids_unweighted "Gini, Aggregate Uneighted Fam Kids"
label variable fam_nokids_weighted "Gini, Aggregate Weighted Fam NoKids"
label variable fam_nokids_unweighted "Gini, Aggregate Uneighted Fam NoKids"
label variable all_fam_weighted "Gini, Aggregate Weighted AllFam"
label variable all_fam_unweighted "Gini, Aggregate Uneighted AllFam"
label variable white "White, Population Share"
label variable black "Black, Population Share"
label variable indian "Indian, Population Share"
label variable asian "Asian, Population Share"
label variable other "Other, Population Share"
label variable ind_msa_dissim "Indicator, MSA Dissim"
label variable ind_msa_dissim "Indicator, MSA Race"


* Label the emp_share variables based on the industry codes:

label variable emp_share1 "Agriculture, forestry, fishing and hunting"
label variable emp_share2 "Construction"
label variable emp_share3 "Real estate & Rental and Leasing / Finance and Insurance"
label variable emp_share4 "Manufacturing"
label variable emp_share5 "Mining, quarrying, and oil and gas extraction"
label variable emp_share6 "Unclassified"
label variable emp_share7 "Public administration"
label variable emp_share8 "Retail trade"
label variable emp_share9 "Educational services; Accommodation and food services; Other services"
label variable emp_share10 "Transportation and warehousing; Utilities"
label variable emp_share11 "Wholesale trade"


label variable ind_msa_gini "Indicator for gini data"
label variable ind_msa_dissim "Indicator for dissim data"
label variable ind_msa_race "Indicator for race data"
label variable ind_bls80 "Indicator for BLS 1980 data"
label variable ind_bls10 "Indicator for BLS 2010 data"



save "$output/metro_level.dta", replace



** Remove the intermediate files 
rm "$output/metro_dissim_all.dta"
rm "$output/metro_level_race.dta"












